Location Based Taxi Cab Analysis

pip install hdbscan
Collecting hdbscan
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.2/5.2 MB 27.9 MB/s eta 0:00:00
ents to build wheel ... etadata (pyproject.toml) ...  hdbscan)
  Using cached Cython-0.29.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (1.9 MB)
Requirement already satisfied: numpy>=1.20 in /usr/local/lib/python3.10/dist-packages (from hdbscan) (1.25.2)
Requirement already satisfied: scipy>=1.0 in /usr/local/lib/python3.10/dist-packages (from hdbscan) (1.11.4)
Requirement already satisfied: scikit-learn>=0.20 in /usr/local/lib/python3.10/dist-packages (from hdbscan) (1.2.2)
Requirement already satisfied: joblib>=1.0 in /usr/local/lib/python3.10/dist-packages (from hdbscan) (1.3.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.20->hdbscan) (3.3.0)
Building wheels for collected packages: hdbscan
  Building wheel for hdbscan (pyproject.toml) ... e=hdbscan-0.8.33-cp310-cp310-linux_x86_64.whl size=3039288 sha256=36420b850873ab1dae3d0b6393d8593e58dd04f1334b8d39ce451aef52492bd4
  Stored in directory: /root/.cache/pip/wheels/75/0b/3b/dc4f60b7cc455efaefb62883a7483e76f09d06ca81cf87d610
Successfully built hdbscan
Installing collected packages: cython, hdbscan
  Attempting uninstall: cython
    Found existing installation: Cython 3.0.9
    Uninstalling Cython-3.0.9:
      Successfully uninstalled Cython-3.0.9
Successfully installed cython-0.29.37 hdbscan-0.8.33
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
import matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import pandas as pd
import numpy as np

from tqdm import tqdm
import csv
import random


from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import silhouette_score

from ipywidgets import interactive
from collections import defaultdict
import hdbscan
import folium
import re
import numpy as np
from heapq import heappush, heappop
cols = ['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4',
        '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff',
        '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1',
        '#000075', '#808080']*10

Exploratory Data

df = pd.read_csv('/content/drive/MyDrive/project-02/taxi_data.csv')
df.head()
LON LAT NAME
0 28.17858 -25.73882 11th Street Taxi Rank
1 28.17660 -25.73795 81 Bazaar Street Taxi Rank
2 27.83239 -26.53722 Adams Road Taxi Rank
3 28.12514 -26.26666 Alberton City Mall Taxi Rank
4 28.10144 -26.10567 Alexandra Main Taxi Rank
df.duplicated(subset=['LON', 'LAT']).values.any()
True
df.isna().values.any()
True
print(f'Before dropping NaNs and dupes\t:\tdf.shape = {df.shape}')
df.dropna(inplace=True)
df.drop_duplicates(subset=['LON', 'LAT'], keep='first', inplace=True)
print(f'After dropping NaNs and dupes\t:\tdf.shape = {df.shape}')
Before dropping NaNs and dupes	:	df.shape = (838, 3)
After dropping NaNs and dupes	:	df.shape = (823, 3)
df.head()
LON LAT NAME
0 28.17858 -25.73882 11th Street Taxi Rank
1 28.17660 -25.73795 81 Bazaar Street Taxi Rank
2 27.83239 -26.53722 Adams Road Taxi Rank
3 28.12514 -26.26666 Alberton City Mall Taxi Rank
4 28.10144 -26.10567 Alexandra Main Taxi Rank
X = np.array(df[['LON', 'LAT']], dtype='float64')
plt.scatter(X[:,0], X[:,1], alpha=0.2, s=50)
<matplotlib.collections.PathCollection at 0x7b7caaa84910>

Visualizing Geographical Data

m = folium.Map(location=[df.LAT.mean(), df.LON.mean()], zoom_start=9,
               tiles='Stamen Toner')

for _, row in df.iterrows():
    folium.CircleMarker(
        location=[row.LAT, row.LON],
        radius=5,
        popup=re.sub(r'[^a-zA-Z ]+', '', row.NAME),
        color='#1787FE',
        fill=True,
        fill_colour='#1787FE'
    ).add_to(m)
m
Make this Notebook Trusted to load map: File -> Trust Notebook

Clustering Strength / Performance Metric

X_blobs, _ = make_blobs(n_samples=1000, centers=10, n_features=2,
                        cluster_std=0.5, random_state=4)
plt.scatter(X_blobs[:,0], X_blobs[:,1], alpha=0.2)
<matplotlib.collections.PathCollection at 0x7b7ca81f36d0>

Uploade sample_clusters.npy file

from google.colab import files
uploaded = files.upload()
Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
Saving sample_clusters.npy to sample_clusters.npy
class_predictions = np.load('sample_clusters.npy')
unique_clusters = np.unique(class_predictions)
for unique_cluster in unique_clusters:
    X = X_blobs[class_predictions==unique_cluster]
    plt.scatter(X[:,0], X[:,1], alpha=0.2, c=cols[unique_cluster])

silhouette_score(X_blobs, class_predictions)
0.6657220862867241

upload sample_clusters_improved.npy file

from google.colab import files
uploaded = files.upload()
Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
Saving sample_clusters_improved.npy to sample_clusters_improved.npy
class_predictions = np.load('sample_clusters_improved.npy')
unique_clusters = np.unique(class_predictions)
for unique_cluster in unique_clusters:
    X = X_blobs[class_predictions==unique_cluster]
    plt.scatter(X[:,0], X[:,1], alpha=0.2, c=cols[unique_cluster])

K-Means Clustering

X_blobs, _ = make_blobs(n_samples=1000, centers=50,
                        n_features=2, cluster_std=1, random_state=4)
data = defaultdict(dict)
for x in range(1,21):
    model = KMeans(n_clusters=3, random_state=17,
                   max_iter=x, n_init=1).fit(X_blobs)

    data[x]['class_predictions'] = model.predict(X_blobs)
    data[x]['centroids'] = model.cluster_centers_
    data[x]['unique_classes'] = np.unique(class_predictions)
def f(x):
    class_predictions = data[x]['class_predictions']
    centroids = data[x]['centroids']
    unique_classes = data[x]['unique_classes']

    for unique_class in unique_classes:
            plt.scatter(X_blobs[class_predictions==unique_class][:,0],
                        X_blobs[class_predictions==unique_class][:,1],
                        alpha=0.3, c=cols[unique_class])
    plt.scatter(centroids[:,0], centroids[:,1], s=200, c='#000000', marker='v')
    plt.ylim([-15,15]); plt.xlim([-15,15])
    plt.title('How K-Means Clusters')

interactive_plot = interactive(f, x=(1, 20))
output = interactive_plot.children[-1]
output.layout.height = '350px'
interactive_plot
{"model_id":"ae33f908a9cc44e0a236216e54c90c91","version_major":2,"version_minor":0}

X = np.array(df[['LON', 'LAT']], dtype='float64')
k = 70
model = KMeans(n_clusters=k, random_state=17).fit(X)
class_predictions = model.predict(X)
df[f'CLUSTER_kmeans{k}'] = class_predictions
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
df.head()
LON LAT NAME CLUSTER_kmeans70
0 28.17858 -25.73882 11th Street Taxi Rank 1
1 28.17660 -25.73795 81 Bazaar Street Taxi Rank 1
2 27.83239 -26.53722 Adams Road Taxi Rank 9
3 28.12514 -26.26666 Alberton City Mall Taxi Rank 8
4 28.10144 -26.10567 Alexandra Main Taxi Rank 4
def create_map(df, cluster_column):
    m = folium.Map(location=[df.LAT.mean(), df.LON.mean()], zoom_start=9, tiles='Stamen Toner')

    for _, row in df.iterrows():

        if row[cluster_column] == -1:
            cluster_colour = '#000000'
        else:
            cluster_colour = cols[row[cluster_column]]

        folium.CircleMarker(
            location= [row['LAT'], row['LON']],
            radius=5,
            popup = folium.Popup(row[cluster_column]),
            #popup= row[cluster_column],
            color=cluster_colour,
            fill=True,
            fill_color=cluster_colour
        ).add_to(m)

    return m

m = create_map(df, 'CLUSTER_kmeans70')
print(f'K={k}')
print(f'Silhouette Score: {silhouette_score(X, class_predictions)}')

m.save('kmeans_70.html')
K=70
Silhouette Score: 0.6527069281516621

1) By using Quadtree

class QuadTree:
    def __init__(self, boundary, capacity):
        self.boundary = boundary  # Boundary of the quadtree
        self.capacity = capacity  # Maximum number of points in a node
        self.points = []  # List of points in the quadtree node
        self.subdivided = False  # Whether the quadtree node has been subdivided

    def insert(self, point):
        # Insert a point into the quadtree
        if not self.boundary.contains(point):
            return False  # Point is outside the quadtree boundary
        if len(self.points) < self.capacity:
            self.points.append(point)
            return True
        else:
            if not self.subdivided:
                self.subdivide()
            # Insert the point into one of the subquadrants
            for child in self.children:
                if child.insert(point):
                    return True

    def subdivide(self):
        # Subdivide the quadtree into four quadrants
        x = self.boundary.x
        y = self.boundary.y
        w = self.boundary.width / 2
        h = self.boundary.height / 2
        nw = Rectangle(x - w/2, y - h/2, w, h)
        self.children = [
            QuadTree(nw, self.capacity),
            QuadTree(Rectangle(x + w/2, y - h/2, w, h), self.capacity),
            QuadTree(Rectangle(x - w/2, y + h/2, w, h), self.capacity),
            QuadTree(Rectangle(x + w/2, y + h/2, w, h), self.capacity)
        ]
        self.subdivided = True

class Point:
    def __init__(self, x, y):
        self.x = x
        self.y = y
class Rectangle:
    def __init__(self, x, y, width, height):
        self.x = x
        self.y = y
        self.width = width
        self.height = height
def read_points_from_csv(file_path):
    points = []
    with open(file_path, 'r') as csvfile:
        csv_reader = csv.reader(csvfile)
        next(csv_reader)  # Skip the header row
        for idx, row in enumerate(csv_reader, start=2):  # start counting from 2 to account for header
            try:
                x, y = map(float, row[:2])
                points.append(Point(x, y))
            except ValueError:
                print(f"Error converting row {idx} to float: {row}")
    return points

# Perform K-Means clustering using QuadTree
def k_means_with_quadtree(points, k, iterations=100):
    # Initialize cluster centroids randomly
    centroids = random.sample(points, k)
    quadtree_boundary = Rectangle(min(p.x for p in points), min(p.y for p in points),
                                  max(p.x for p in points) - min(p.x for p in points),
                                  max(p.y for p in points) - min(p.y for p in points))
    quadtree = QuadTree(quadtree_boundary, len(points))

    # Perform K-Means iterations
    for _ in range(iterations):
        # Assign points to clusters
        clusters = [[] for _ in range(k)]
        for point in points:
            min_dist = float('inf')
            closest_centroid = None
            for i, centroid in enumerate(centroids):
                dist = np.sqrt((point.x - centroid.x)**2 + (point.y - centroid.y)**2)
                if dist < min_dist:
                    min_dist = dist
                    closest_centroid = i
            clusters[closest_centroid].append(point)

        # Update centroids
        for i in range(k):
            if clusters[i]:
                centroids[i] = Point(np.mean([p.x for p in clusters[i]]), np.mean([p.y for p in clusters[i]]))

    # Calculate Silhouette Score
    all_points = [point for cluster in clusters for point in cluster]
    labels = [i for i, cluster in enumerate(clusters) for _ in cluster]
    silhouette_avg = silhouette_score([np.array([p.x, p.y]) for p in all_points], labels)
    print("Silhouette Score:", silhouette_avg)

    return clusters

# Example usage
file_path = '/content/drive/MyDrive/project-02/taxi_data.csv'
points = read_points_from_csv(file_path)
k = 3  # Number of clusters

# Perform K-Means clustering using QuadTree
clusters = k_means_with_quadtree(points, k)
for i, cluster in enumerate(clusters):
    print(f'Cluster {i+1}: {len(cluster)} points')
Error converting row 839 to float: ['', '', '']
Silhouette Score: 0.43370387764528767
Cluster 1: 264 points
Cluster 2: 148 points
Cluster 3: 425 points
m
Make this Notebook Trusted to load map: File -> Trust Notebook

Final Score by Using Quadtree

best_silhouette, best_k = -1, 0

for k in tqdm(range(2, 100)):
    model = KMeans(n_clusters=k, random_state=1).fit(X)
    class_predictions = model.predict(X)

    curr_silhouette = silhouette_score(X, class_predictions)
    if curr_silhouette > best_silhouette:
        best_k = k
        best_silhouette = curr_silhouette

print(f'K={best_k}')
print(f'Silhouette Score: {best_silhouette}')
  0%|          | 0/98 [00:00<?, ?it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
  2%|▏         | 2/98 [00:00<00:09, 10.56it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
  4%|▍         | 4/98 [00:00<00:08, 11.31it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
  6%|▌         | 6/98 [00:00<00:08, 11.07it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
  8%|▊         | 8/98 [00:00<00:08, 10.74it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 10%|█         | 10/98 [00:00<00:08, 10.49it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 12%|█▏        | 12/98 [00:01<00:09,  8.84it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 13%|█▎        | 13/98 [00:01<00:10,  8.26it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 14%|█▍        | 14/98 [00:01<00:10,  7.99it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 15%|█▌        | 15/98 [00:01<00:10,  7.82it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 16%|█▋        | 16/98 [00:01<00:10,  7.47it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 17%|█▋        | 17/98 [00:01<00:11,  7.15it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 18%|█▊        | 18/98 [00:02<00:11,  6.86it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 19%|█▉        | 19/98 [00:02<00:11,  6.62it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 20%|██        | 20/98 [00:02<00:12,  6.48it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 21%|██▏       | 21/98 [00:02<00:12,  6.33it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 22%|██▏       | 22/98 [00:02<00:11,  6.61it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 23%|██▎       | 23/98 [00:02<00:11,  6.40it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 24%|██▍       | 24/98 [00:03<00:11,  6.68it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 26%|██▌       | 25/98 [00:03<00:11,  6.27it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 27%|██▋       | 26/98 [00:03<00:11,  6.30it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 28%|██▊       | 27/98 [00:03<00:10,  6.59it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 29%|██▊       | 28/98 [00:03<00:11,  6.26it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 30%|██▉       | 29/98 [00:03<00:11,  6.03it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 31%|███       | 30/98 [00:04<00:10,  6.48it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 32%|███▏      | 31/98 [00:04<00:10,  6.16it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 33%|███▎      | 32/98 [00:04<00:11,  5.80it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 34%|███▎      | 33/98 [00:04<00:11,  5.89it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 35%|███▍      | 34/98 [00:04<00:10,  5.82it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 36%|███▌      | 35/98 [00:04<00:11,  5.58it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 37%|███▋      | 36/98 [00:05<00:10,  5.72it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 38%|███▊      | 37/98 [00:05<00:10,  5.73it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 39%|███▉      | 38/98 [00:05<00:10,  5.92it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 40%|███▉      | 39/98 [00:05<00:10,  5.83it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 41%|████      | 40/98 [00:05<00:09,  5.80it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 42%|████▏     | 41/98 [00:05<00:10,  5.62it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 43%|████▎     | 42/98 [00:06<00:09,  5.91it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 44%|████▍     | 43/98 [00:06<00:09,  5.53it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 45%|████▍     | 44/98 [00:06<00:09,  5.68it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 46%|████▌     | 45/98 [00:06<00:08,  5.96it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 47%|████▋     | 46/98 [00:06<00:08,  6.21it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 48%|████▊     | 47/98 [00:06<00:07,  6.50it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 49%|████▉     | 48/98 [00:07<00:07,  6.64it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 50%|█████     | 49/98 [00:07<00:07,  6.42it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 51%|█████     | 50/98 [00:07<00:08,  5.63it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 52%|█████▏    | 51/98 [00:07<00:08,  5.32it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 53%|█████▎    | 52/98 [00:07<00:08,  5.38it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 54%|█████▍    | 53/98 [00:08<00:08,  5.48it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 55%|█████▌    | 54/98 [00:08<00:07,  5.74it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 56%|█████▌    | 55/98 [00:08<00:07,  5.51it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 57%|█████▋    | 56/98 [00:08<00:07,  5.58it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 58%|█████▊    | 57/98 [00:08<00:07,  5.84it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 59%|█████▉    | 58/98 [00:09<00:08,  4.56it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 60%|██████    | 59/98 [00:10<00:18,  2.10it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 61%|██████    | 60/98 [00:11<00:23,  1.59it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 62%|██████▏   | 61/98 [00:11<00:23,  1.55it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 63%|██████▎   | 62/98 [00:11<00:17,  2.00it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 64%|██████▍   | 63/98 [00:12<00:14,  2.36it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 65%|██████▌   | 64/98 [00:12<00:12,  2.81it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 66%|██████▋   | 65/98 [00:12<00:12,  2.75it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 67%|██████▋   | 66/98 [00:13<00:11,  2.70it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 68%|██████▊   | 67/98 [00:14<00:20,  1.50it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 69%|██████▉   | 68/98 [00:14<00:17,  1.68it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 70%|███████   | 69/98 [00:15<00:14,  1.96it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 71%|███████▏  | 70/98 [00:16<00:18,  1.53it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 72%|███████▏  | 71/98 [00:16<00:16,  1.66it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 73%|███████▎  | 72/98 [00:17<00:15,  1.70it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 74%|███████▍  | 73/98 [00:17<00:13,  1.86it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 76%|███████▌  | 74/98 [00:19<00:19,  1.24it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 77%|███████▋  | 75/98 [00:20<00:21,  1.09it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 78%|███████▊  | 76/98 [00:20<00:18,  1.18it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 79%|███████▊  | 77/98 [00:21<00:13,  1.52it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 80%|███████▉  | 78/98 [00:21<00:10,  1.91it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 81%|████████  | 79/98 [00:21<00:08,  2.36it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 82%|████████▏ | 80/98 [00:22<00:07,  2.39it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 83%|████████▎ | 81/98 [00:23<00:10,  1.56it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 84%|████████▎ | 82/98 [00:24<00:11,  1.42it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 85%|████████▍ | 83/98 [00:24<00:10,  1.45it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 86%|████████▌ | 84/98 [00:24<00:07,  1.82it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 87%|████████▋ | 85/98 [00:25<00:05,  2.21it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 88%|████████▊ | 86/98 [00:25<00:04,  2.60it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 89%|████████▉ | 87/98 [00:25<00:03,  2.88it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 90%|████████▉ | 88/98 [00:25<00:03,  3.20it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 91%|█████████ | 89/98 [00:26<00:02,  3.52it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 92%|█████████▏| 90/98 [00:26<00:02,  3.52it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 93%|█████████▎| 91/98 [00:26<00:01,  3.66it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 94%|█████████▍| 92/98 [00:26<00:01,  3.64it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 95%|█████████▍| 93/98 [00:27<00:01,  3.89it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 96%|█████████▌| 94/98 [00:27<00:01,  3.75it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 97%|█████████▋| 95/98 [00:27<00:00,  3.87it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 98%|█████████▊| 96/98 [00:27<00:00,  3.95it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
 99%|█████████▉| 97/98 [00:28<00:00,  4.06it/s]/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
100%|██████████| 98/98 [00:28<00:00,  3.45it/s]
K=99
Silhouette Score: 0.708499624089545

2) A* Algo



class Point:
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __repr__(self):
        return f"({self.x}, {self.y})"

class Cluster:
    def __init__(self, center):
        self.center = center
        self.points = [center]

def euclidean_distance(point1, point2):
    return np.sqrt((point1.x - point2.x)**2 + (point1.y - point2.y)**2)

def a_star_clustering(points, k):
    # Initialize clusters with random centers
    clusters = [Cluster(points[i]) for i in np.random.choice(len(points), k, replace=False)]

    # Create a priority queue for the frontier
    frontier = []
    for cluster_index, cluster in enumerate(clusters):
        for point in points:
            distance = euclidean_distance(cluster.center, point)
            heappush(frontier, (distance, cluster_index, point))

    # Perform A* search
    while frontier and len(clusters) < k:
        _, closest_cluster_index, point = heappop(frontier)
        closest_cluster = clusters[closest_cluster_index]
        closest_cluster.points.append(point)
        closest_cluster.center = Point(np.mean([p.x for p in closest_cluster.points]), np.mean([p.y for p in closest_cluster.points]))

        # Update priorities for points in the frontier
        for cluster_index, cluster in enumerate(clusters):
            distance = euclidean_distance(cluster.center, point)
            heappush(frontier, (distance, cluster_index, point))

    return clusters

# Example usage
points = [Point(1, 2), Point(3, 4), Point(5, 6), Point(7, 8), Point(9, 10)]
k = 2  # Number of clusters

clusters = a_star_clustering(points, k)
for i, cluster in enumerate(clusters):
    print(f'Cluster {i+1}: Center={cluster.center}, Points={cluster.points}')
Cluster 1: Center=(1, 2), Points=[(1, 2)]
Cluster 2: Center=(5, 6), Points=[(5, 6)]

3) DBSCAN

# code for indexing out certain values
dummy = np.array([-1, -1, -1, 2, 3, 4, 5, -1])

new = np.array([(counter+2)*x if x==-1 else x for counter, x in enumerate(dummy)])
model = DBSCAN(eps=0.01, min_samples=5).fit(X)
class_predictions = model.labels_

df['CLUSTERS_DBSCAN'] = class_predictions
m = create_map(df, 'CLUSTERS_DBSCAN')


print(f'Number of clusters found: {len(np.unique(class_predictions))}')
print(f'Number of outliers found: {len(class_predictions[class_predictions==-1])}')

print(f'Silhouette ignoring outliers: {silhouette_score(X[class_predictions!=-1], class_predictions[class_predictions!=-1])}')

no_outliers = 0
no_outliers = np.array([(counter+2)*x if x==-1 else x for counter, x in enumerate(class_predictions)])
print(f'Silhouette outliers as singletons: {silhouette_score(X, no_outliers)}')
Number of clusters found: 51
Number of outliers found: 289
Silhouette ignoring outliers: 0.923213824804447
Silhouette outliers as singletons: 0.5667489347252957
m
Make this Notebook Trusted to load map: File -> Trust Notebook

4) HDBSCAN By ARIMA

import numpy as np
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
import hdbscan

# Example time series data
time_series_data = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2020', periods=1000))

# Preprocess the time series data with ARIMA (Example only, adjust as needed)
model = ARIMA(time_series_data, order=(5,1,0))
results = model.fit()
forecast_values = results.forecast(steps=100)  # Example forecast for the next 100 steps

# Convert time series data into feature vectors (Example only, adjust as needed)
# You might extract features like mean, variance, trend, etc.
feature_vectors = []
for i in range(len(time_series_data)):
    # Example: Using mean and variance as features
    mean_value = time_series_data[:i+1].mean()
    var_value = time_series_data[:i+1].var()
    feature_vectors.append([mean_value, var_value])

# Clustering with HDBSCAN
clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
cluster_labels = clusterer.fit_predict(feature_vectors)

# Explore the clusters
unique_labels = np.unique(cluster_labels)
for label in unique_labels:
    cluster_points = time_series_data[cluster_labels == label]
    print(f'Cluster {label}: {cluster_points}')
Cluster -1: 2020-01-01   -0.704216
2020-01-02    1.399119
2020-01-03    0.363188
2020-01-04    0.050744
2020-01-05    0.554875
2020-01-06   -0.239715
2020-01-07    1.514180
2020-01-08   -2.309120
2020-01-09   -0.144164
2020-01-10    0.042631
2020-01-11    0.759298
2020-01-12   -0.355809
2020-01-13    0.524350
2020-01-14    0.131778
2020-01-15    0.624274
2020-01-16   -1.818769
2020-01-17    0.244839
2020-01-18    0.152294
2020-01-21   -1.585520
2020-01-22   -1.720975
2020-01-23   -0.396916
2020-01-24   -1.351961
2020-01-25   -1.071558
2020-01-26   -1.555756
2020-01-27    1.398499
2020-01-28    0.799952
2020-01-29   -0.025454
2020-01-30    1.105543
2020-01-31   -0.156894
2020-02-01   -2.183866
2020-02-02    0.087394
2020-02-03    1.468162
2020-02-04   -0.684677
2020-02-05   -1.251450
2020-02-06    1.804672
2020-02-07   -1.185606
2020-02-08    1.272429
2020-02-09   -0.492270
2020-02-10   -1.532597
2020-02-11    0.684331
2020-02-12    0.265645
2020-02-13    0.597446
2020-02-14   -0.118132
2020-02-15   -0.020143
2020-02-16   -0.454072
2020-02-17   -1.045907
2020-02-18   -0.105658
2020-02-19    1.590500
2020-02-21    1.218901
2020-02-22    0.662205
2020-03-01    0.697293
2020-03-03    1.715116
2020-03-04   -1.167126
2020-03-06    0.493523
2020-03-19   -0.584890
2020-03-20    0.571006
2020-03-21    0.638134
2020-03-22   -0.461349
2020-03-23    0.368731
2020-04-05    1.394731
dtype: float64
Cluster 0: 2020-02-20    1.642020
2020-02-23   -1.171977
2020-02-24   -1.185857
2020-02-25   -0.798569
2020-02-26    0.040349
2020-02-27    0.800950
2020-02-28    0.993745
2020-02-29    0.142409
2020-03-05   -1.466236
2020-03-07   -2.203236
2020-03-08    0.057498
2020-03-09   -0.868469
2020-03-10   -0.132699
2020-03-11    1.853238
2020-03-12   -0.868211
2020-03-13   -0.681443
2020-03-14    0.040114
2020-03-15    1.319954
2020-03-16   -1.285173
2020-03-17   -0.228192
2020-03-18   -0.780250
dtype: float64
Cluster 1: 2020-01-19   -1.930638
2020-01-20    0.918921
2020-03-02   -0.302912
2020-03-24    0.015326
2020-03-25    0.195708
                ...   
2022-09-22   -0.426785
2022-09-23   -0.325458
2022-09-24    0.412711
2022-09-25   -0.948460
2022-09-26   -1.229683
Length: 919, dtype: float64
model = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=2,
                        cluster_selection_epsilon=0.01)
#min_cluster_size
#min_samples
#cluster_slection_epsilon

class_predictions = model.fit_predict(X)
df['CLUSTER_HDBSCAN'] = class_predictions
m = create_map(df, 'CLUSTER_HDBSCAN')

print(f'Number of clusters found: {len(np.unique(class_predictions))-1}')
print(f'Number of outliers found: {len(class_predictions[class_predictions==-1])}')

print(f'Silhouette ignoring outliers: {silhouette_score(X[class_predictions!=-1], class_predictions[class_predictions!=-1])}')

no_outliers = np.array([(counter+2)*x if x==-1 else x for counter, x in enumerate(class_predictions)])
print(f'Silhouette outliers as singletons: {silhouette_score(X, no_outliers)}')

m
Number of clusters found: 66
Number of outliers found: 102
Silhouette ignoring outliers: 0.7670504354649553
Silhouette outliers as singletons: 0.6389924831127337
Make this Notebook Trusted to load map: File -> Trust Notebook
hdbscan.HDBSCAN?
classifier = KNeighborsClassifier(n_neighbors=1)
df_train = df[df.CLUSTER_HDBSCAN!=-1]
df_predict = df[df.CLUSTER_HDBSCAN==-1]
X_train = np.array(df_train[['LON', 'LAT']], dtype='float64')
y_train = np.array(df_train['CLUSTER_HDBSCAN'])

X_predict = np.array(df_predict[['LON', 'LAT']], dtype='float64')
classifier.fit(X_train, y_train)
KNeighborsClassifier(n_neighbors=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
predictions = classifier.predict(X_predict)
df['CLUSTER_hybrid'] = df['CLUSTER_HDBSCAN']
df.loc[df.CLUSTER_HDBSCAN==-1, 'CLUSTER_hybrid'] = predictions
m = create_map(df, 'CLUSTER_hybrid')
m
Make this Notebook Trusted to load map: File -> Trust Notebook
df['CLUSTER_hybrid'].value_counts().plot.hist(bins=70, alpha=0.4,
                                              label='Hybrid')
df['CLUSTER_kmeans70'].value_counts().plot.hist(bins=70, alpha=0.4,
                                               label='K-Means (70)')
plt.legend()
plt.title('Comparing Hybrid and K-Means Approaches')
plt.xlabel('Cluster Sizes')
Text(0.5, 0, 'Cluster Sizes')